# Load the CanPath dataset and restrict it to the variables used below.
data <- read_csv("C:/Users/Sohana/OneDrive - University of Saskatchewan/ML 898/data/can_path_data.csv")
#glimpse(data)

# Analysis variables, in the order they should appear after ID.
analysis_vars <- c('SDC_AGE_CALC','PA_TOTAL_SHORT', 'PM_BMI_SR', 'SDC_EDU_LEVEL_AGE', 'PSE_ADULT_WRK_DURATION', 'PM_WAIST_HIP_RATIO_SR', 'PA_SIT_AVG_TIME_DAY', 'SLE_TIME', 'NUT_VEG_QTY', 'NUT_FRUITS_QTY', 'NUT_JUICE_QTY', 'ALC_CUR_FREQ')

data <- data |>
  dplyr::select(ID, dplyr::all_of(analysis_vars))

# Descriptive summary (mean/sd, median/range, missing counts) of all covariates.
rm_covsum(data = data,
          covs = analysis_vars)
| n=41187 | |
|---|---|
| SDC AGE CALC | |
| Mean (sd) | 51.5 (10.8) |
| Median (Min,Max) | 52 (30, 74) |
| PA TOTAL SHORT | |
| Mean (sd) | 2574.1 (2656.2) |
| Median (Min,Max) | 1782 (0, 19278) |
| Missing | 6763 |
| PM BMI SR | |
| Mean (sd) | 27.5 (6.2) |
| Median (Min,Max) | 26.6 (8.9, 69.4) |
| Missing | 11976 |
| SDC EDU LEVEL AGE | |
| Mean (sd) | 25.4 (9.2) |
| Median (Min,Max) | 23 (-7, 73) |
| Missing | 2817 |
| PSE ADULT WRK DURATION | |
| Mean (sd) | 6.6 (9.4) |
| Median (Min,Max) | 2 (0, 51) |
| Missing | 5731 |
| PM WAIST HIP RATIO SR | |
| Mean (sd) | 0.9 (0.1) |
| Median (Min,Max) | 0.9 (0.3, 2.4) |
| Missing | 17734 |
| PA SIT AVG TIME DAY | |
| Mean (sd) | 391.1 (366.9) |
| Median (Min,Max) | 360 (0, 9999) |
| Missing | 11257 |
| SLE TIME | |
| Mean (sd) | 435.9 (75.1) |
| Median (Min,Max) | 430 (0, 1390) |
| Missing | 2792 |
| NUT VEG QTY | |
| Mean (sd) | 2.7 (1.7) |
| Median (Min,Max) | 2 (0, 35) |
| Missing | 2549 |
| NUT FRUITS QTY | |
| Mean (sd) | 2.1 (1.4) |
| Median (Min,Max) | 2 (0, 25) |
| Missing | 2426 |
| NUT JUICE QTY | |
| Mean (sd) | 0.8 (1.1) |
| Median (Min,Max) | 1 (0, 44) |
| Missing | 3583 |
| ALC CUR FREQ | |
| Mean (sd) | 3.0 (3.3) |
| Median (Min,Max) | 3 (-7, 7) |
| Missing | 1683 |
## Two variables had values coded as -7 (a missing-data code). Converting those to missing.
## Recode the -7 missing-value codes to NA, then cap sitting time; one mutate()
## instead of three separate passes over the data.
data <- data %>%
  mutate(
    # Negative values (-7) are missing-data codes, not real measurements.
    SDC_EDU_LEVEL_AGE = if_else(SDC_EDU_LEVEL_AGE < 0, NA_real_, SDC_EDU_LEVEL_AGE),
    ALC_CUR_FREQ = if_else(ALC_CUR_FREQ < 0, NA_real_, ALC_CUR_FREQ),
    # Cap average daily sitting time at 360 minutes (e.g. the 9999 seen in the
    # summary above). pmin() propagates NA, so this matches the original
    # case_when(PA_SIT_AVG_TIME_DAY > 360 ~ 360, TRUE ~ ...) behavior exactly.
    PA_SIT_AVG_TIME_DAY = pmin(PA_SIT_AVG_TIME_DAY, 360)
  )
### Drop NA: keep complete cases only.
data <- drop_na(data)
# Re-run the descriptive summary on the complete-case data.
rm_covsum(data=data,
covs=c('SDC_AGE_CALC','PA_TOTAL_SHORT', 'PM_BMI_SR', 'SDC_EDU_LEVEL_AGE', 'PSE_ADULT_WRK_DURATION', 'PM_WAIST_HIP_RATIO_SR', 'PA_SIT_AVG_TIME_DAY', 'SLE_TIME', 'NUT_VEG_QTY', 'NUT_FRUITS_QTY', 'NUT_JUICE_QTY', 'ALC_CUR_FREQ'))
| n=13242 | |
|---|---|
| SDC AGE CALC | |
| Mean (sd) | 51.8 (10.5) |
| Median (Min,Max) | 52 (30, 74) |
| PA TOTAL SHORT | |
| Mean (sd) | 2757.5 (2658.1) |
| Median (Min,Max) | 2010 (0, 19278) |
| PM BMI SR | |
| Mean (sd) | 27.3 (5.8) |
| Median (Min,Max) | 26.5 (8.9, 69.4) |
| SDC EDU LEVEL AGE | |
| Mean (sd) | 25.3 (8.3) |
| Median (Min,Max) | 23 (1, 73) |
| PSE ADULT WRK DURATION | |
| Mean (sd) | 6.5 (9.3) |
| Median (Min,Max) | 2 (0, 51) |
| PM WAIST HIP RATIO SR | |
| Mean (sd) | 0.9 (0.1) |
| Median (Min,Max) | 0.9 (0.4, 2.4) |
| PA SIT AVG TIME DAY | |
| Mean (sd) | 303.1 (78.0) |
| Median (Min,Max) | 360 (0, 360) |
| SLE TIME | |
| Mean (sd) | 435.7 (69.2) |
| Median (Min,Max) | 430 (0, 960) |
| NUT VEG QTY | |
| Mean (sd) | 2.7 (1.6) |
| Median (Min,Max) | 2 (0, 30) |
| NUT FRUITS QTY | |
| Mean (sd) | 2.1 (1.3) |
| Median (Min,Max) | 2 (0, 21) |
| NUT JUICE QTY | |
| Mean (sd) | 0.8 (1.0) |
| Median (Min,Max) | 1 (0, 15) |
| ALC CUR FREQ | |
| Mean (sd) | 3.7 (2.1) |
| Median (Min,Max) | 4 (0, 7) |
# Correlation matrix of the raw variables: lower triangle only, with the
# coefficients printed on the tiles.
raw_corr <- data |>
  correlate() |>
  rearrange() |>
  shave()
rplot(raw_corr, print_cor = TRUE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# PCA preprocessing: keep ID as an identifier, standardize every predictor
# (scale then center — equivalent to the usual z-score here), then project
# onto principal components.
pca_recipe <- recipe(~ ., data = data) |>
  update_role(ID, new_role = "id") |>
  step_scale(all_predictors()) |>
  step_center(all_predictors()) |>
  step_pca(all_predictors(), id = "pca_id")
pca_recipe

# Estimate the preprocessing parameters from the data (retain = TRUE keeps
# the processed training set) and then apply them.
pca_prepared <- prep(pca_recipe, retain = TRUE)
pca_prepared

pca_baked <- bake(pca_prepared, data)
pca_baked
## # A tibble: 13,242 × 6
## ID PC1 PC2 PC3 PC4 PC5
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SYN_58624 0.746 -1.57 0.418 -0.781 0.118
## 2 SYN_58628 0.291 0.166 -1.24 -1.18 -0.837
## 3 SYN_586217 1.02 0.482 -0.381 -0.518 -0.797
## 4 SYN_586219 -0.170 -2.79 -0.830 0.870 -0.0621
## 5 SYN_586221 -0.144 0.509 0.441 -1.08 -1.91
## 6 SYN_586230 0.0477 -1.22 -1.16 -0.107 0.0144
## 7 SYN_586246 -1.21 0.275 -0.534 -0.441 -0.222
## 8 SYN_586247 -1.41 3.44 -1.13 0.478 -1.16
## 9 SYN_586250 -1.28 -0.973 -1.13 0.145 -0.661
## 10 SYN_586256 -0.228 -1.83 -0.0119 -0.544 -0.349
## # ℹ 13,232 more rows
# Extract the PCA loadings and plot them per component; the dashed lines at
# +/-0.2 mark a common rule of thumb for "meaningful" loadings.
pca_variables <- tidy(pca_prepared, id = "pca_id", type = "coef")

loading_plot <- ggplot(pca_variables) +
  geom_point(aes(x = value, y = terms, color = component)) +
  labs(color = NULL) +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = -0.2, linetype = "dashed") +
  geom_vline(xintercept = 0.2, linetype = "dashed") +
  facet_wrap(~ component) +
  theme_minimal()
loading_plot
# Per-component variance explained, shown as a scree line plot and as a
# labelled bar chart.
pca_variances <- tidy(pca_prepared, id = "pca_id", type = "variance")
pca_variance <- pca_variances |> filter(terms == "percent variance")
pca_variance$component <- as.factor(pca_variance$component)
# (The former numeric `comp` column was never used and has been removed.)

ggplot(pca_variance, aes(x = component, y = value, group = 1, color = component)) +
  geom_point() +
  geom_line() +
  labs(x = "Principal Components", y = "Variance explained (%)") +
  theme_minimal()

pca_variance2 <- ggplot(pca_variance, aes(x = component, y = value)) +
  # geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
  geom_col(fill = "lightgreen") +
  geom_text(aes(label = paste0(round(value, 2), "%")), vjust = -0.5, size = 3) +
  labs(x = "Principal Components", y = "Percentage Explained Variance") +
  theme_minimal()
pca_variance2
Interpretation: The bar chart shows that PC1 explains about 13.37% of
the variance, PC2 explains about 12.69% of the variance, PC3 explains
about 9.93% of the variance and PC4 explains about 9.23% of the
variance. The remaining components explain less than 9% of the variance
each.
# Cumulative variance explained across the principal components.
pca_cummul_variance <- pca_variances |> filter(terms == "cumulative percent variance")
pca_cummul_variance$component <- as.factor(pca_cummul_variance$component)
# (The former numeric `comp` column was never used and has been removed.)
ggplot(pca_cummul_variance, aes(x = component, y = value, group = 1, color = component)) +
  geom_point() +
  geom_line() +
  # Fixed axis-label typo: "Cummulative" -> "Cumulative".
  labs(x = "Principal Components", y = "Cumulative Variance explained (%)") +
  theme_minimal()
Interpretation: The cumulative variance plot shows that the first few components explain a large portion of the variance in the data. PC1 to PC4 explain about 50% of the variance and PC1 to PC7 explain about 80% of the variance. This suggests that we can reduce the dimensionality of the data while retaining a significant amount of information by keeping only the first few principal components.
# Drop the ID column and verify that the principal components are mutually
# uncorrelated (a defining property of PCA).
pca_corr <- dplyr::select(pca_baked, -ID)
pca_corr |>
  correlate() |>
  rearrange() |>
  shave() |>
  rplot(print_cor = TRUE)

# PC1 vs PC2 scatter: no systematic linear relationship is expected.
ggplot(pca_baked, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm") +
  labs(x = "PC1", y = "PC2") +
  theme_minimal()
Interpretation: The correlation plot shows that the principal components are uncorrelated with each other, which is a key property of PCA. The scatter plot of PC1 vs PC2 shows that there is no clear linear relationship between these two components, which is expected since they are uncorrelated. This suggests that the PCA has successfully transformed the original correlated variables into a new set of uncorrelated components.
# Interactive 3D scatter of PC3-PC5 to look for visual clustering.
scatter_3d <- plot_ly(
  pca_baked,
  x = ~PC3, y = ~PC4, z = ~PC5,
  type = "scatter3d", mode = "markers",
  marker = list(size = 3)
) %>%
  layout(
    title = "3D Scatter Plot",
    scene = list(
      xaxis = list(title = "PC3"),
      yaxis = list(title = "PC4"),
      zaxis = list(title = "PC5")
    )
  )
# Display the 3D scatter plot
scatter_3d
Interpretation: The 3D scatter plot of PC3, PC4, and PC5 shows that there is no clear clustering or grouping of the data points in this reduced dimensional space. This suggests that these components may not capture distinct patterns or clusters in the data, and further analysis may be needed to identify meaningful groupings.
# Fix the RNG seed so the k-means initialization below is reproducible.
set.seed(10)
# tidyclust specification: k-means with 4 clusters, using stats::kmeans.
kmeans_model <- k_means(num_clusters = 4) %>%
set_engine("stats")
kmeans_model
## K Means Cluster Specification (partition)
##
## Main Arguments:
## num_clusters = 4
##
## Computational engine: stats
# Clustering preprocessing: ID is an identifier only; drop zero-variance
# columns, then center and scale the numeric predictors.
kmeans_recipe <- recipe(~ ., data = data) |>
  update_role(ID, new_role = "id") |>
  step_zv(all_predictors()) |>
  step_center(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors())

set.seed(10)
kmeans_workflow <- workflow() |>
  add_recipe(kmeans_recipe) |>
  add_model(kmeans_model)

# Fit k-means (k = 4) on the preprocessed data.
set.seed(10)
kmeans_fit <- kmeans_workflow |> fit(data = data)
kmeans_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_zv()
## • step_center()
## • step_scale()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 4 clusters of sizes 4423, 3512, 3301, 2006
##
## Cluster means:
## SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR SDC_EDU_LEVEL_AGE
## 3 -0.7272835 -0.2579418 -0.4554022 -0.19251605
## 4 0.2663898 -0.3231741 0.7748929 -0.01345518
## 1 0.2095763 0.5756346 -0.2792562 0.32719500
## 2 0.7923245 0.1872851 0.1070014 -0.09038765
## PSE_ADULT_WRK_DURATION PM_WAIST_HIP_RATIO_SR PA_SIT_AVG_TIME_DAY SLE_TIME
## 3 -0.4348796 -0.4250043 0.19351286 -0.050096574
## 4 -0.2920426 0.5765997 0.28102106 -0.006145864
## 1 -0.2597760 -0.1763795 -0.53365711 0.006988034
## 2 1.8976305 0.2178487 -0.04050411 0.109717807
## NUT_VEG_QTY NUT_FRUITS_QTY NUT_JUICE_QTY ALC_CUR_FREQ
## 3 -0.2473456 -0.2431350 -0.12233785 -0.3732542
## 4 -0.3657558 -0.3219000 0.12873015 0.1738716
## 1 0.8706258 0.8441120 -0.03233799 0.2058765
## 2 -0.2469552 -0.2893893 0.09758113 0.1797945
##
## Clustering vector:
## [1] 1 2 3 1 2 2 2 4 2 1 1 3 1 3 3 1 3 4 2 2 3 2 1 2 2 2 1 2 2 3 3 3 2 3 4 2
## [37] 1 2 4 3 2 2 3 4 1 2 3 1 4 1 1 4 1 1 3 3 1 1 1 2 1 4 4 2 2 1 1 1 3 2 1 2
## [73] 4 3 4 3 2 1 3 1 1 2 2 1 1 3 2 3 3 1 3 3 1 4 3 2 2 2 3 4 1 3 1 2 4 1 2 2
## [109] 4 2 1 4 2 2 2 2 2 2 1 3 4 2 2 1 3 3 2 1 2 3 1 1 2 2 3 2 1 1 3 1 1 1 2 1
## [145] 1 2 3 2 1 2 1 2 1 2 2 2 4 4 2 1 2 1 1 2 2 2 3 4 3 2 2 2 1 2 3 1 2 2 1 2
## [181] 1 2 3 1 3 4 1 2 1 1 3 4 2 1 3 4 3 2 4 3 3 1 1 1 1 3 4 1 1 3 3 2 3 1 1 3
## [217] 2 2 4 4 2 2 1 1 3 2 4 4 4 2 2 1 4 4 1 1 1 2 1 2 3 2 4 2 3 2 2 4 3 1 2 3
## [253] 1 4 2 1 2 2 3 2 1 2 2 1 1 1 3 4 1 1 1 1 1 3 2 1 1 1 3 2 2 2 4 3 4 2 1 1
## [289] 1 1 1 4 1 4 1 2 2 2 2 2 2 3 1 2 1 1 1 3 2 2 2 3 2 1 2 2 2 2 1 2 2 1 1 2
## [325] 1 4 2 1 4 2 3 4 1 2 1 3 2 1 2 4 1 3 2 4 1 4 2 1 1 1 1 4 1 2 1 2 2 1 1 2
## [361] 3 2 4 1 1 2 3 3 1 1 1 1 2 2 2 3 2 2 4 3 1 4 3 1 2 2 3 1 2 1 1 2 2 1 4 2
## [397] 2 3 2 1 3 3 1 1 1 1 4 4 1 2 3 1 2 2 2 1 1 4 2 2 1 2 4 3 1 3 3 1 1 1 2 3
## [433] 2 2 4 1 2 1 1 1 3 2 1 2 1 1 3 1 1 4 1 1 2 1 1 3 1 1 2 2 4 1 1 1 2 3 4 1
## [469] 1 3 1 1 2 3 2 1 4 1 2 1 4 2 3 3 3 4 1 4 1 4 3 4 1 1 3 1 1 2 1 2 2 2 3 3
## [505] 3 2 2 1 1 1 1 3 2 3 1 1 1 1 1 1 1 4 2 1 3 1 2 1 2 2 2 3 3 1 1 1 1 2 4 1
## [541] 3 2 2 1 2 1 4 3 1 1 3 2 2 1 2 1 2 4 1 2 2 3 4 3 2 4 2 1 2 3 3 3 3 2 3 1
## [577] 1 2 1 4 3 2 3 2 4 1 1 1 2 3 4 1 4 1 2 2 2 3 1 2 2 1 4 3 1 1 3 3 2 1 1 1
## [613] 2 1 2 1 1 1 3 1 4 2 3 4 3 2 3 1 2 2 1 1 2 2 1 4 3 1 3 2 4 2 2 1 3 3 2 1
## [649] 3 4 2 3 1 1 1 2 1 2 3 1 1 4 2 1 1 2 2 2 3 4 1 3 3 2 1 3 1 1 3 2 2 1 2 3
## [685] 2 1 4 3 1 3 1 3 3 3 1 2 1 2 4 2 4 2 4 1 2 3 4 1 1 1 2 1 1 2 1 2 1 1 2 3
## [721] 4 4 1 3 1 2 2 3 2 1 2 2 1 3 1 1 3 2 1 2 3 4 1 1 2 3 1 2 2 3 2 1 3 1 4 1
## [757] 1 1 3 2 3 1 2 4 3 1 1 2 3 1 1 1 2 3 1 1 1 4 1 1 3 2 2 2 1 3 3 2 2 2 3 3
## [793] 3 2 1 1 2 3 1 2 3 1 2 2 1 2 2 1 4 2 4 4 1 2 2 3 1 1 4 4 2 3 1 2 1 2 2 3
## [829] 2 1 3 3 2 1 1 2 1 3 2 4 3 1 2 1 4 2 3 1 2 2 3 2 1 1 3 4 3 1 2 2 2 4 1 1
## [865] 3 2 2 2 2 2 1 1 2 3 2 3 3 2 3 2 2 2 2 2 3 2 2 3 2 1 4 2 3 2 1 3 4 4 1 3
## [901] 2 1 2 2 4 1 3 3 4 4 2 3 1 2 3 3 1 3 1 1 2 1 2 2 1 4 3 2 2 4 1 3 3 1 3 3
## [937] 1 2 1 3 1 2 3 2 1 2 2 3 1 1 1 2 2 4 3 2 2 3 2 1 4 1 1 1 2 1 3 2 4 4 1 1
## [973] 3 2 1 2 4 1 3 2 4 2 4 1 1 2 1 3 1 2 2 4 4 4 3 3 1 1 2 3 2 3 1 3 1 3 1 2
## [1009] 2 2 1 1 2 2 3 1 1 1 2 1 1 2 1 4 3 1 2 1 1 1 2 1 3 4 1 2 4 2 1 3 2 4 1 4
## [1045] 1 3 2 2 3 2 2 3 3 1 1 1 3 3 3 2 2 1 1 1 2 3 2 4 2 3 3 1 4 2 1 1 1 2 1 1
##
## ...
## and 347 more lines.
# Pull each row's cluster assignment and attach it to the data as a factor
# column named `clusters`.
assignments <- extract_cluster_assignment(kmeans_fit)
assignments <- as.data.frame(assignments)
names(assignments) <- "cluster"
data$clusters <- assignments$cluster
# Factor summary = per-cluster row counts.
summary(data$clusters)
## Cluster_1 Cluster_2 Cluster_3 Cluster_4
## 4423 3512 3301 2006
# Pairwise plots of selected variables, colored by cluster membership.
pair_vars <- c("SDC_AGE_CALC", "PA_TOTAL_SHORT", "PM_BMI_SR",
               "SDC_EDU_LEVEL_AGE", "PSE_ADULT_WRK_DURATION",
               "PA_SIT_AVG_TIME_DAY", "ALC_CUR_FREQ", "clusters")
data %>%
  dplyr::select(dplyr::all_of(pair_vars)) %>%
  ggpairs(aes(fill = clusters, color = clusters))

# Cluster centroids reshaped to long format for a profile ("parallel
# coordinates") plot of the standardized cluster means.
centroids <- extract_centroids(kmeans_fit)
centroids_long <- centroids %>%
  pivot_longer(
    cols = c("SDC_AGE_CALC", "PA_TOTAL_SHORT", "PM_BMI_SR",
             "SDC_EDU_LEVEL_AGE", "PSE_ADULT_WRK_DURATION",
             "PM_WAIST_HIP_RATIO_SR", "PA_SIT_AVG_TIME_DAY", "SLE_TIME",
             "NUT_VEG_QTY", "NUT_FRUITS_QTY", "NUT_JUICE_QTY",
             "ALC_CUR_FREQ"),
    names_to = "name", values_to = "value"
  )
ggplot(centroids_long, aes(x = name, y = value, group = .cluster, color = .cluster)) +
  geom_point() +
  geom_line() +
  labs(x = "", y = "Value at cluster center") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Interpretation: The plot of the cluster centroids shows distinct profiles across the standardized variables. Based on the centroid table, Cluster 1 has the highest physical activity (PA_TOTAL_SHORT), the highest vegetable and fruit intake, and the lowest average sitting time; Cluster 2 is the oldest group and has by far the longest adult work duration; Cluster 3 is the youngest group, with the lowest BMI, waist-hip ratio, and alcohol frequency; and Cluster 4 has the highest BMI and waist-hip ratio together with relatively low fruit and vegetable intake. These patterns suggest that the clusters may represent different subgroups of individuals with varying characteristics related to age, physical activity, BMI, education level, work duration, sitting time, and alcohol consumption. Further analysis may be needed to interpret the meaning of these clusters in the context of the data and research question.
# Tunable k-means spec: num_clusters left as a tuning parameter.
kmeans_model_tune <- k_means(num_clusters = tune()) %>%
set_engine("stats")
# NOTE(review): "kmodes_" looks like a typo for "kmeans_"; the name is kept
# because it is referenced again later in this script.
kmodes_workflow_tune <- workflow() %>%
add_recipe(kmeans_recipe) %>%
add_model(kmeans_model_tune)
# 2-fold CV resamples. NOTE(review): `data` now includes the `clusters`
# column added above; the recipe's roles were fixed before that column
# existed, so it should be ignored during tuning — confirm.
folds <- vfold_cv(data, v = 2)
# Candidate values of k: 1 through 10.
grid <- tibble(num_clusters=1:10)
# Evaluate each k by average silhouette width across the resamples.
tuned_model <- tune_cluster(kmodes_workflow_tune,
resamples = folds,
grid = grid,
metrics = cluster_metric_set(silhouette_avg),
control = control_resamples(save_pred = TRUE,
verbose = TRUE,
parallel_over = "everything")
)
collect_metrics(tuned_model) %>% head()
## # A tibble: 6 × 7
## num_clusters .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 silhouette_avg standard NaN 0 NA Preprocessor1…
## 2 2 silhouette_avg standard 0.0956 2 0.00225 Preprocessor1…
## 3 3 silhouette_avg standard 0.0891 2 0.00469 Preprocessor1…
## 4 4 silhouette_avg standard 0.0921 2 0.00353 Preprocessor1…
## 5 5 silhouette_avg standard 0.0865 2 0.00669 Preprocessor1…
## 6 6 silhouette_avg standard 0.0728 2 0.000896 Preprocessor1…
autoplot(tuned_model)
Interpretation: The silhouette method was used to evaluate clustering performance across different values of k. The highest average silhouette width was observed at k = 2 (approximately 0.096), indicating that two clusters provide the best separation among the tested configurations. Although k = 2 scored highest, k = 4 also showed relatively strong performance, and k = 4 was selected as the number of clusters for this assignment. However, the overall silhouette values are relatively low, suggesting modest cluster separation in the data.
# Copy of the data without the previously attached cluster labels, so the
# final clustering is not influenced by the earlier assignment column.
data2 <- data %>% dplyr::select(-clusters)
set.seed(10)
# Final k-means specification with the chosen k = 4.
kmeans_final_model <- k_means(num_clusters = 4) %>%
set_engine("stats")
kmeans_final_model
## K Means Cluster Specification (partition)
##
## Main Arguments:
## num_clusters = 4
##
## Computational engine: stats
# Final clustering recipe on data2 (clusters column removed): ID is an
# identifier; drop zero-variance columns and standardize numeric predictors.
kmeans_recipe_final <- recipe(~ ., data = data2) %>%
  update_role(ID, new_role = "id") %>%
  step_zv(all_predictors()) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors())
kmeans_recipe_final

set.seed(10)
kmeans_workflow_final <- workflow() %>%
  add_recipe(kmeans_recipe_final) %>%
  add_model(kmeans_final_model)

# Refit k-means (k = 4) on data2.
set.seed(10)
kmeans_fit_final <- kmeans_workflow_final %>% fit(data = data2)
kmeans_fit_final
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_zv()
## • step_center()
## • step_scale()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 4 clusters of sizes 4423, 3512, 3301, 2006
##
## Cluster means:
## SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR SDC_EDU_LEVEL_AGE
## 3 -0.7272835 -0.2579418 -0.4554022 -0.19251605
## 4 0.2663898 -0.3231741 0.7748929 -0.01345518
## 1 0.2095763 0.5756346 -0.2792562 0.32719500
## 2 0.7923245 0.1872851 0.1070014 -0.09038765
## PSE_ADULT_WRK_DURATION PM_WAIST_HIP_RATIO_SR PA_SIT_AVG_TIME_DAY SLE_TIME
## 3 -0.4348796 -0.4250043 0.19351286 -0.050096574
## 4 -0.2920426 0.5765997 0.28102106 -0.006145864
## 1 -0.2597760 -0.1763795 -0.53365711 0.006988034
## 2 1.8976305 0.2178487 -0.04050411 0.109717807
## NUT_VEG_QTY NUT_FRUITS_QTY NUT_JUICE_QTY ALC_CUR_FREQ
## 3 -0.2473456 -0.2431350 -0.12233785 -0.3732542
## 4 -0.3657558 -0.3219000 0.12873015 0.1738716
## 1 0.8706258 0.8441120 -0.03233799 0.2058765
## 2 -0.2469552 -0.2893893 0.09758113 0.1797945
##
## Clustering vector:
## [1] 1 2 3 1 2 2 2 4 2 1 1 3 1 3 3 1 3 4 2 2 3 2 1 2 2 2 1 2 2 3 3 3 2 3 4 2
## [37] 1 2 4 3 2 2 3 4 1 2 3 1 4 1 1 4 1 1 3 3 1 1 1 2 1 4 4 2 2 1 1 1 3 2 1 2
## [73] 4 3 4 3 2 1 3 1 1 2 2 1 1 3 2 3 3 1 3 3 1 4 3 2 2 2 3 4 1 3 1 2 4 1 2 2
## [109] 4 2 1 4 2 2 2 2 2 2 1 3 4 2 2 1 3 3 2 1 2 3 1 1 2 2 3 2 1 1 3 1 1 1 2 1
## [145] 1 2 3 2 1 2 1 2 1 2 2 2 4 4 2 1 2 1 1 2 2 2 3 4 3 2 2 2 1 2 3 1 2 2 1 2
## [181] 1 2 3 1 3 4 1 2 1 1 3 4 2 1 3 4 3 2 4 3 3 1 1 1 1 3 4 1 1 3 3 2 3 1 1 3
## [217] 2 2 4 4 2 2 1 1 3 2 4 4 4 2 2 1 4 4 1 1 1 2 1 2 3 2 4 2 3 2 2 4 3 1 2 3
## [253] 1 4 2 1 2 2 3 2 1 2 2 1 1 1 3 4 1 1 1 1 1 3 2 1 1 1 3 2 2 2 4 3 4 2 1 1
## [289] 1 1 1 4 1 4 1 2 2 2 2 2 2 3 1 2 1 1 1 3 2 2 2 3 2 1 2 2 2 2 1 2 2 1 1 2
## [325] 1 4 2 1 4 2 3 4 1 2 1 3 2 1 2 4 1 3 2 4 1 4 2 1 1 1 1 4 1 2 1 2 2 1 1 2
## [361] 3 2 4 1 1 2 3 3 1 1 1 1 2 2 2 3 2 2 4 3 1 4 3 1 2 2 3 1 2 1 1 2 2 1 4 2
## [397] 2 3 2 1 3 3 1 1 1 1 4 4 1 2 3 1 2 2 2 1 1 4 2 2 1 2 4 3 1 3 3 1 1 1 2 3
## [433] 2 2 4 1 2 1 1 1 3 2 1 2 1 1 3 1 1 4 1 1 2 1 1 3 1 1 2 2 4 1 1 1 2 3 4 1
## [469] 1 3 1 1 2 3 2 1 4 1 2 1 4 2 3 3 3 4 1 4 1 4 3 4 1 1 3 1 1 2 1 2 2 2 3 3
## [505] 3 2 2 1 1 1 1 3 2 3 1 1 1 1 1 1 1 4 2 1 3 1 2 1 2 2 2 3 3 1 1 1 1 2 4 1
## [541] 3 2 2 1 2 1 4 3 1 1 3 2 2 1 2 1 2 4 1 2 2 3 4 3 2 4 2 1 2 3 3 3 3 2 3 1
## [577] 1 2 1 4 3 2 3 2 4 1 1 1 2 3 4 1 4 1 2 2 2 3 1 2 2 1 4 3 1 1 3 3 2 1 1 1
## [613] 2 1 2 1 1 1 3 1 4 2 3 4 3 2 3 1 2 2 1 1 2 2 1 4 3 1 3 2 4 2 2 1 3 3 2 1
## [649] 3 4 2 3 1 1 1 2 1 2 3 1 1 4 2 1 1 2 2 2 3 4 1 3 3 2 1 3 1 1 3 2 2 1 2 3
## [685] 2 1 4 3 1 3 1 3 3 3 1 2 1 2 4 2 4 2 4 1 2 3 4 1 1 1 2 1 1 2 1 2 1 1 2 3
## [721] 4 4 1 3 1 2 2 3 2 1 2 2 1 3 1 1 3 2 1 2 3 4 1 1 2 3 1 2 2 3 2 1 3 1 4 1
## [757] 1 1 3 2 3 1 2 4 3 1 1 2 3 1 1 1 2 3 1 1 1 4 1 1 3 2 2 2 1 3 3 2 2 2 3 3
## [793] 3 2 1 1 2 3 1 2 3 1 2 2 1 2 2 1 4 2 4 4 1 2 2 3 1 1 4 4 2 3 1 2 1 2 2 3
## [829] 2 1 3 3 2 1 1 2 1 3 2 4 3 1 2 1 4 2 3 1 2 2 3 2 1 1 3 4 3 1 2 2 2 4 1 1
## [865] 3 2 2 2 2 2 1 1 2 3 2 3 3 2 3 2 2 2 2 2 3 2 2 3 2 1 4 2 3 2 1 3 4 4 1 3
## [901] 2 1 2 2 4 1 3 3 4 4 2 3 1 2 3 3 1 3 1 1 2 1 2 2 1 4 3 2 2 4 1 3 3 1 3 3
## [937] 1 2 1 3 1 2 3 2 1 2 2 3 1 1 1 2 2 4 3 2 2 3 2 1 4 1 1 1 2 1 3 2 4 4 1 1
## [973] 3 2 1 2 4 1 3 2 4 2 4 1 1 2 1 3 1 2 2 4 4 4 3 3 1 1 2 3 2 3 1 3 1 3 1 2
## [1009] 2 2 1 1 2 2 3 1 1 1 2 1 1 2 1 4 3 1 2 1 1 1 2 1 3 4 1 2 4 2 1 3 2 4 1 4
## [1045] 1 3 2 2 3 2 2 3 3 1 1 1 3 3 3 2 2 1 1 1 2 3 2 4 2 3 3 1 4 2 1 1 1 2 1 1
##
## ...
## and 347 more lines.
# Attach the final model's cluster assignments to data2.
clusters <- kmeans_fit_final |> extract_cluster_assignment()
clusters <- as.data.frame(clusters)
names(clusters) <- c("cluster")
data2$clusters <- clusters$cluster

# Workflow for tuning k with the FINAL recipe.
kmodel_workflow_tune <- workflow() %>%
  add_recipe(kmeans_recipe_final) %>%
  add_model(kmeans_model_tune)

# BUG FIX: this section previously resampled `data` and tuned
# `kmodes_workflow_tune` (the earlier workflow built on `kmeans_recipe`),
# so `kmodel_workflow_tune` and `data2` were never used and the printed
# metrics simply repeated the first tuning run. Tune the new workflow on
# resamples of data2 instead. (The `clusters` column added above is not in
# the recipe's variable roles, which were fixed when the recipe was created.)
folds <- vfold_cv(data2, v = 2)
grid <- tibble(num_clusters = 1:10)
tuned_model <- tune_cluster(kmodel_workflow_tune,
  resamples = folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg),
  control = control_resamples(save_pred = TRUE,
    verbose = TRUE,
    parallel_over = "everything")
)
collect_metrics(tuned_model) %>% head()
## # A tibble: 6 × 7
## num_clusters .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 silhouette_avg standard NaN 0 NA Preprocessor1…
## 2 2 silhouette_avg standard 0.0956 2 0.00225 Preprocessor1…
## 3 3 silhouette_avg standard 0.0891 2 0.00469 Preprocessor1…
## 4 4 silhouette_avg standard 0.0921 2 0.00353 Preprocessor1…
## 5 5 silhouette_avg standard 0.0865 2 0.00669 Preprocessor1…
## 6 6 silhouette_avg standard 0.0728 2 0.000896 Preprocessor1…
# Reproducible 70/30 train/test split. NOTE(review): `data` still carries the
# k-means `clusters` column added earlier, so it is available as a predictor
# in the models below — confirm this is intended for a "baseline".
set.seed(10)
data_split <- initial_split(data, prop = 0.70)
# Create data frames for the two sets:
train_data <- training(data_split)
summary(train_data$PM_BMI_SR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.86 23.37 26.45 27.32 30.13 67.31
test_data <- testing(data_split)
# Check that the outcome distribution is similar in train and test.
summary(test_data$PM_BMI_SR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.86 23.40 26.46 27.37 30.16 69.40
# Linear regression specification using the glm engine.
linear_model <- linear_reg() %>%
set_engine("glm") %>%
set_mode("regression")
linear_model
## Linear Regression Model Specification (regression)
##
## Computational engine: glm
# Baseline recipe: regress BMI on the first 7 principal components of the
# numeric predictors.
# NOTE(review): `train_data` also contains the `clusters` factor, which is
# not numeric, so it bypasses step_pca and enters the model as dummy
# variables (visible as clustersCluster_* in the fitted coefficients).
# Those clusters were derived from the full dataset, which includes
# PM_BMI_SR — this may leak outcome information into the "baseline";
# confirm whether clusters belong in this model.
pca_baseline_recipe <- recipe(PM_BMI_SR ~., data = train_data) %>%
update_role(ID, new_role = "id") %>%
step_scale(all_numeric_predictors()) %>%
step_center(all_numeric_predictors()) %>%
step_pca(all_numeric_predictors(), num_comp =7, id = "pca_id") |>
step_zv(all_predictors())
pca_baseline_recipe
# Combine the recipe and linear model into one workflow.
baseline_bmi_workflow <-
workflow() %>%
add_model(linear_model) %>%
add_recipe(pca_baseline_recipe)
baseline_bmi_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
##
## • step_scale()
## • step_center()
## • step_pca()
## • step_zv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: glm
# Fit the baseline workflow on the training data.
baseline_bmi_fit <- fit(baseline_bmi_workflow, data = train_data)

# Suppress scientific notation and show 3 significant digits.
options(scipen = 999, digits = 3)

# Tidied coefficient table from the underlying glm fit.
baseline_bmi_fit_extract <- tidy(extract_fit_parsnip(baseline_bmi_fit))
baseline_bmi_fit_extract
## # A tibble: 11 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 24.1 0.115 211. 0
## 2 clustersCluster_2 7.94 0.157 50.6 0
## 3 clustersCluster_3 1.40 0.194 7.24 4.70e-13
## 4 clustersCluster_4 4.63 0.249 18.6 1.45e-75
## 5 PC1 -0.538 0.0608 -8.85 9.99e-19
## 6 PC2 -0.168 0.0627 -2.68 7.42e- 3
## 7 PC3 -0.238 0.0487 -4.88 1.08e- 6
## 8 PC4 0.108 0.0502 2.15 3.13e- 2
## 9 PC5 0.571 0.0535 10.7 2.33e-26
## 10 PC6 -0.117 0.0528 -2.22 2.65e- 2
## 11 PC7 0.572 0.0523 10.9 1.12e-27
# augment() appends .pred and .resid columns to the test set.
baseline_bmi_predicted <- augment(baseline_bmi_fit, test_data)
# Predicted vs measured BMI on the test set.
ggplot(baseline_bmi_predicted, aes(x = PM_BMI_SR, y = .pred)) +
geom_point(alpha = 0.1) +
geom_smooth(method = "lm") +
labs(x = "Measured BMI", y = "Predicted BMI") +
theme_minimal()
# NOTE(review): baseline_bmi_predicted already contains PM_BMI_SR, so this
# bind_cols duplicates it (hence the PM_BMI_SR...1 / PM_BMI_SR...7 columns
# in the printed tibble) — the extra select is redundant.
pred_true <- test_data |>
dplyr::select(PM_BMI_SR) |>
bind_cols(baseline_bmi_predicted)
head(pred_true)
## # A tibble: 6 × 17
## PM_BMI_SR...1 .pred .resid ID SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR...7
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 22.5 24.8 -2.32 SYN_58624 58 594 22.5
## 2 32.2 25.2 7.00 SYN_5862… 55 5332. 32.2
## 3 39.5 25.7 13.8 SYN_5862… 30 66 39.5
## 4 28.3 32.8 -4.47 SYN_5862… 46 99 28.3
## 5 30.6 31.6 -0.972 SYN_5862… 62 4788 30.6
## 6 47.7 30.9 16.8 SYN_5862… 60 480 47.7
## # ℹ 10 more variables: SDC_EDU_LEVEL_AGE <dbl>, PSE_ADULT_WRK_DURATION <dbl>,
## # PM_WAIST_HIP_RATIO_SR <dbl>, PA_SIT_AVG_TIME_DAY <dbl>, SLE_TIME <dbl>,
## # NUT_VEG_QTY <dbl>, NUT_FRUITS_QTY <dbl>, NUT_JUICE_QTY <dbl>,
## # ALC_CUR_FREQ <dbl>, clusters <fct>
# Reproducible 70/30 split of data2 (final cluster labels attached).
set.seed(10)
# Create a split object
data_split2 <- initial_split(data2, prop = 0.70)
# Build training and testing data set
train_data2 <- training(data_split2)
test_data2 <- testing(data_split2)
# Outcome distribution should be similar across the two sets.
summary(train_data2$PM_BMI_SR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.86 23.37 26.45 27.32 30.13 67.31
summary(test_data2$PM_BMI_SR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.86 23.40 26.46 27.37 30.16 69.40
# Cluster sizes in the training set.
table(train_data2$clusters)
##
## Cluster_1 Cluster_2 Cluster_3 Cluster_4
## 3097 2477 2286 1409
table(test_data2$clusters)
##
## Cluster_1 Cluster_2 Cluster_3 Cluster_4
## 1326 1035 1015 597
# Linear (glm-engine) regression of BMI on cluster membership only.
linear_model2 <- linear_reg() %>%
  set_engine("glm") %>%
  set_mode("regression")

# One-hot encode the cluster factor. NOTE(review): with an intercept,
# one-hot encoding makes one dummy redundant; the glm fit reports it as NA
# (see the Cluster_4 row in the coefficient table below).
cluster_reg_recipe <- recipe(PM_BMI_SR ~ clusters, data = train_data2) |>
  step_dummy(all_nominal_predictors(), one_hot = TRUE) |>
  step_zv(all_predictors())
cluster_reg_recipe

# Workflow
bmi_workflow_2 <- workflow() |>
  add_model(linear_model2) |>
  add_recipe(cluster_reg_recipe)
bmi_workflow_2
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_dummy()
## • step_zv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: glm
# Fit the cluster-only model on the training data.
bmi_fit_2 <- fit(bmi_workflow_2, data = train_data2)

# Suppress scientific notation and show 3 significant digits.
options(scipen = 999, digits = 3)

# Tidied coefficient table from the underlying glm fit.
bmi_fit_extract_2 <- tidy(extract_fit_parsnip(bmi_fit_2))
bmi_fit_extract_2
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 28.0 0.134 210. 0
## 2 clusters_Cluster_1 -3.43 0.161 -21.3 3.71e- 98
## 3 clusters_Cluster_2 3.73 0.167 22.3 4.28e-107
## 4 clusters_Cluster_3 -2.34 0.170 -13.8 1.29e- 42
## 5 clusters_Cluster_4 NA NA NA NA
# augment() appends .pred and .resid for the cluster-only model's test set.
bmi_predicted_2 <- augment(bmi_fit_2, test_data2)
bmi_predicted_2
## # A tibble: 3,973 × 16
## .pred .resid ID SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR SDC_EDU_LEVEL_AGE
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 24.6 -2.16 SYN_586… 58 594 22.5 17
## 2 25.7 6.47 SYN_586… 55 5332. 32.2 28
## 3 24.6 14.9 SYN_586… 30 66 39.5 26
## 4 31.8 -3.45 SYN_586… 46 99 28.3 22
## 5 31.8 -1.18 SYN_586… 62 4788 30.6 20
## 6 31.8 16.0 SYN_586… 60 480 47.7 18
## 7 31.8 20.6 SYN_586… 48 0 52.4 19
## 8 25.7 1.75 SYN_586… 31 4932 27.5 18
## 9 25.7 4.75 SYN_586… 69 9198 30.5 45
## 10 25.7 1.63 SYN_586… 66 10430 27.3 25
## # ℹ 3,963 more rows
## # ℹ 9 more variables: PSE_ADULT_WRK_DURATION <dbl>,
## # PM_WAIST_HIP_RATIO_SR <dbl>, PA_SIT_AVG_TIME_DAY <dbl>, SLE_TIME <dbl>,
## # NUT_VEG_QTY <dbl>, NUT_FRUITS_QTY <dbl>, NUT_JUICE_QTY <dbl>,
## # ALC_CUR_FREQ <dbl>, clusters <fct>
# NOTE(review): bmi_predicted_2 already contains PM_BMI_SR, so this
# bind_cols duplicates it (PM_BMI_SR...1 / PM_BMI_SR...7 in the output) —
# the extra select is redundant.
pred_true_2 <- test_data2 |>
dplyr::select(PM_BMI_SR) |>
bind_cols(bmi_predicted_2)
head(pred_true_2)
## # A tibble: 6 × 17
## PM_BMI_SR...1 .pred .resid ID SDC_AGE_CALC PA_TOTAL_SHORT PM_BMI_SR...7
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 22.5 24.6 -2.16 SYN_58624 58 594 22.5
## 2 32.2 25.7 6.47 SYN_5862… 55 5332. 32.2
## 3 39.5 24.6 14.9 SYN_5862… 30 66 39.5
## 4 28.3 31.8 -3.45 SYN_5862… 46 99 28.3
## 5 30.6 31.8 -1.18 SYN_5862… 62 4788 30.6
## 6 47.7 31.8 16.0 SYN_5862… 60 480 47.7
## # ℹ 10 more variables: SDC_EDU_LEVEL_AGE <dbl>, PSE_ADULT_WRK_DURATION <dbl>,
## # PM_WAIST_HIP_RATIO_SR <dbl>, PA_SIT_AVG_TIME_DAY <dbl>, SLE_TIME <dbl>,
## # NUT_VEG_QTY <dbl>, NUT_FRUITS_QTY <dbl>, NUT_JUICE_QTY <dbl>,
## # ALC_CUR_FREQ <dbl>, clusters <fct>
# Test-set metrics (RMSE, R-squared, MAE) for both models.
baseline_metrics <- metrics(baseline_bmi_predicted,
                            truth = PM_BMI_SR, estimate = .pred)
tuned_metrics <- metrics(bmi_predicted_2,
                         truth = PM_BMI_SR, estimate = .pred)
baseline_metrics
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 5.01
## 2 rsq standard 0.277
## 3 mae standard 3.74
tuned_metrics
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 5.12
## 2 rsq standard 0.245
## 3 mae standard 3.82
Interpretation: The baseline PCA regression model has an RMSE of approximately 5.01, while the linear regression model using only the cluster variable has an RMSE of approximately 5.12. This suggests that the baseline PCA model has slightly better predictive performance than the cluster-only model. However, the difference in RMSE is relatively small, indicating that both models have similar predictive accuracy for BMI in this dataset. Further analysis may be needed to determine whether the difference in performance is statistically significant and to explore other potential factors that could influence BMI predictions.
# Coefficients with |estimate| > 0.5, largest magnitude first. The aliased
# Cluster_4 row has an NA estimate, so the filter drops it (NA condition),
# matching the rendered table below.
coeff <- tidy(bmi_fit_2) %>%
  filter(abs(estimate) > 0.5) %>%
  arrange(desc(abs(estimate)))
knitr::kable(coeff)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 28.05 | 0.134 | 209.8 | 0 |
| clusters_Cluster_2 | 3.73 | 0.167 | 22.3 | 0 |
| clusters_Cluster_1 | -3.43 | 0.161 | -21.3 | 0 |
| clusters_Cluster_3 | -2.34 | 0.170 | -13.8 | 0 |
ggplot(coeff, aes(x = term, y = estimate, fill = term)) + geom_col() + coord_flip()
Interpretation: The feature importance plot shows that the cluster variable has a significant impact on the predicted BMI. Cluster 2 appears to represent the highest-BMI group, while Clusters 1 and 3 have lower BMI than the reference group.
sessionInfo()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Regina
## tzcode source: internal
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] knitr_1.51 GGally_2.4.0 tidyclust_0.2.4 see_0.12.0
## [5] performance_0.15.3 corrr_0.4.5 themis_1.0.3 glmnet_4.1-10
## [9] Matrix_1.7-4 vip_0.4.5 mlbench_2.1-6 gtsummary_2.5.0
## [13] finalfit_1.1.0 psych_2.5.6 plotly_4.12.0 sjPlot_2.9.0
## [17] reportRmd_0.1.1 yardstick_1.3.2 workflowsets_1.1.1 workflows_1.3.0
## [21] tune_2.0.1 tailor_0.1.0 rsample_1.3.1 recipes_1.3.1
## [25] parsnip_1.4.1 modeldata_1.5.1 infer_1.1.0 dials_1.4.2
## [29] scales_1.4.0 broom_1.0.11 tidymodels_1.4.1 lubridate_1.9.4
## [33] forcats_1.0.1 stringr_1.6.0 dplyr_1.1.4 purrr_1.2.0
## [37] readr_2.1.6 tidyr_1.3.2 tibble_3.3.0 ggplot2_4.0.1
## [41] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.5.2 hardhat_1.4.2 rpart_4.1.24
## [4] sparsevctrs_0.3.5 lifecycle_1.0.4 Rdpack_2.6.4
## [7] rstatix_0.7.3 globals_0.18.0 lattice_0.22-7
## [10] vroom_1.6.7 MASS_7.3-65 insight_1.4.4
## [13] crosstalk_1.2.2 backports_1.5.0 magrittr_2.0.4
## [16] sass_0.4.10 rmarkdown_2.30 jquerylib_0.1.4
## [19] yaml_2.3.12 otel_0.2.0 cowplot_1.2.0
## [22] minqa_1.2.8 RColorBrewer_1.1-3 abind_1.4-8
## [25] nnet_7.3-20 ipred_0.9-15 lava_1.8.2
## [28] seriation_1.5.8 listenv_0.10.0 parallelly_1.46.1
## [31] svglite_2.2.2 codetools_0.2-20 xml2_1.5.1
## [34] tidyselect_1.2.1 shape_1.4.6.1 farver_2.1.2
## [37] lme4_1.1-38 TSP_1.2.6 stats4_4.5.2
## [40] jsonlite_2.0.0 mitml_0.4-5 Formula_1.2-5
## [43] survival_3.8-3 iterators_1.0.14 systemfonts_1.3.1
## [46] foreach_1.5.2 tools_4.5.2 cmprsk_2.2-12
## [49] Rcpp_1.1.0 glue_1.8.0 mnormt_2.1.1
## [52] prodlim_2025.04.28 gridExtra_2.3 pan_1.9
## [55] xfun_0.55 mgcv_1.9-3 flexclust_1.5.0
## [58] ca_0.71.1 withr_3.0.2 ROSE_0.0-4
## [61] fastmap_1.2.0 boot_1.3-32 digest_0.6.39
## [64] timechange_0.3.0 R6_2.6.1 mice_3.19.0
## [67] textshaping_1.0.4 utf8_1.2.6 generics_0.1.4
## [70] data.table_1.18.0 class_7.3-23 httr_1.4.7
## [73] htmlwidgets_1.6.4 ggstats_0.12.0 pkgconfig_2.0.3
## [76] gtable_0.3.6 timeDate_4051.111 modeltools_0.2-24
## [79] registry_0.5-1 GPfit_1.0-9 S7_0.2.1
## [82] furrr_0.3.1 htmltools_0.5.9 carData_3.0-5
## [85] kableExtra_1.4.0 gower_1.0.2 reformulas_0.4.3.1
## [88] rstudioapi_0.17.1 tzdb_0.5.0 nlme_3.1-168
## [91] nloptr_2.2.1 cachem_1.1.0 pillar_1.11.1
## [94] grid_4.5.2 vctrs_0.6.5 ggpubr_0.6.2
## [97] car_3.1-3 jomo_2.7-6 cluster_2.1.8.1
## [100] lhs_1.2.0 evaluate_1.0.5 cli_3.6.5
## [103] compiler_4.5.2 rlang_1.1.6 crayon_1.5.3
## [106] future.apply_1.20.1 ggsignif_0.6.4 labeling_0.4.3
## [109] stringi_1.8.7 modelenv_0.2.0 philentropy_0.10.0
## [112] viridisLite_0.4.2 lazyeval_0.2.2 geepack_1.3.13
## [115] aod_1.3.3 hms_1.1.4 bit64_4.6.0-1
## [118] future_1.69.0 rbibutils_2.4 RcppParallel_5.1.11-1
## [121] bslib_0.9.0 bit_4.6.0 DiceDesign_1.10